describe(a3)
hist(a$Score)
hist(a$nwords)
hist(a$b_concreteness)
hist(a$mcd)
hist(a$usf)
cor.test(a$Score,a$b_concreteness)
##
## Pearson's product-moment correlation
##
## data: a$Score and a$b_concreteness
## t = -0.17945, df = 478, p-value = 0.8577
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.09763610 0.08135285
## sample estimates:
## cor
## -0.008207364
cor.test(a$Score,a$mcd)
##
## Pearson's product-moment correlation
##
## data: a$Score and a$mcd
## t = 8.5114, df = 478, p-value = 2.236e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2824510 0.4380577
## sample estimates:
## cor
## 0.3627806
cor.test(a$Score,a$usf)
##
## Pearson's product-moment correlation
##
## data: a$Score and a$usf
## t = -7.7064, df = 478, p-value = 7.542e-14
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.4097460 -0.2503862
## sample estimates:
## cor
## -0.3324368
hist(a$v_nsubj_deltap_govcue)
hist(a$v_nsubj_deltap_depcue)
hist(a$v_nsubj_deltap_strgst)
hist(a$v_nsubj_MI)
hist(a$v_nsubj_T)
hist(a$v_dobj_deltap_govcue) #heavily skewed (right tail)
hist(a$v_dobj_deltap_depcue)
hist(a$v_dobj_deltap_strgst) #looks the nicest
hist(a$v_dobj_MI)
hist(a$v_dobj_T)
hist(a$v_advmod_deltap_govcue)
hist(a$v_advmod_deltap_depcue)
hist(a$v_advmod_deltap_strgst)
hist(a$v_advmod_MI)
hist(a$v_advmod_T)
hist(a$n_amod_deltap_govcue,breaks = 50) # some outliers
hist(a$n_amod_deltap_depcue,breaks = 50)
hist(a$n_amod_deltap_strgst,breaks = 50)
hist(a$n_amod_MI,breaks = 50)
hist(a$n_amod_T,breaks = 50)
refined normal indices using python script…
varnames <- rbind("McD CD", "USF CD", "Verb—Nsubject DeltaP Dep", "Verb—Dobj (MI)", "Verb—Advmod Delta P Strongest", "Noun—Amod (MI)", "Adjective Frequency (Logged)", "Adverb Frequency (Logged)", "Main verb Frequency (Logged)", "Content word lemma Frequency", "Lemma bigram DeltaP Strongest")
IndexCategory <- rbind("Contextual distinctiveness", "Contextual distinctiveness", "Dependency bigram", "Dependency bigram", "Dependency bigram", "Dependency bigram", "Word Frequency", "Word Frequency", "Word Frequency", "Word Frequency", "Bigram")
#desc <- describe(a2[,2:12])
#desc$variable <- varnames
#write.csv(desc, "descriptives.csv")
#corr <- as.data.frame(cor(a2[,2:12]))
##corr$variable <- varnames
#write.csv(corr, "correlations.csv")
correlation <- corr.test(y = a2$Score, x = a2[,2:12], method = "pearson")
cor_result2 <- as.data.frame(cbind(varnames, IndexCategory,correlation$r, correlation$se))
cor_result <- cbind(varnames, IndexCategory,print(correlation, short = F, digit = 3))
## Call:corr.test(x = a2[, 2:12], y = a2$Score, method = "pearson")
## Correlation matrix
## [,1]
## mcd 0.363
## usf -0.332
## v_nsubj_deltap_depcue 0.133
## v_dobj_MI 0.224
## v_advmod_deltap_strgst 0.261
## n_amod_MI 0.254
## amod_freq_log -0.208
## adv_manner_freq_log 0.134
## lex_mverb_freq_log -0.129
## cw_lemma_freq_log -0.193
## lemm_bg_deltap_strgst 0.174
## Sample Size
## [1] 480
## Probability values adjusted for multiple tests.
## [,1]
## mcd 0.000
## usf 0.000
## v_nsubj_deltap_depcue 0.010
## v_dobj_MI 0.000
## v_advmod_deltap_strgst 0.000
## n_amod_MI 0.000
## amod_freq_log 0.000
## adv_manner_freq_log 0.010
## lex_mverb_freq_log 0.010
## cw_lemma_freq_log 0.000
## lemm_bg_deltap_strgst 0.001
##
## Confidence intervals based upon normal theory. To get bootstrapped values, try cor.ci
## raw.lower raw.r raw.upper raw.p lower.adj upper.adj
## mcd-NA 0.282 0.363 0.438 0.000 0.245 0.470
## usf-NA -0.410 -0.332 -0.250 0.000 -0.441 -0.214
## v_n__-NA 0.044 0.133 0.220 0.010 0.032 0.233
## v__MI-NA 0.137 0.224 0.307 0.000 0.104 0.337
## v_d__-NA 0.176 0.261 0.342 0.000 0.139 0.375
## n__MI-NA 0.168 0.254 0.336 0.000 0.133 0.367
## amd__-NA -0.292 -0.208 -0.121 0.000 -0.320 -0.090
## ad___-NA 0.045 0.134 0.221 0.010 0.025 0.239
## lx___-NA -0.216 -0.129 -0.040 0.010 -0.216 -0.040
## cw___-NA -0.277 -0.193 -0.105 0.000 -0.303 -0.077
## lm___-NA 0.086 0.174 0.259 0.001 0.061 0.282
cor_result$cor_abs <- abs(cor_result$raw.r)
ggplot(cor_result, aes(x = reorder(varnames, cor_abs), y = raw.r, family = "serif", fill = IndexCategory, shape = IndexCategory), ymax = .5, ymin = -.5)+
geom_bar( stat = "identity", width = .7) +
geom_pointrange(aes(x = reorder(varnames, cor_abs), y = raw.r, ymin = raw.lower, ymax = raw.upper, width = .15)) +
ylim(-1, 1) +
geom_text(aes(y = -.79, label = raw.r, size = 1.7), hjust = "outward", family = "serif") +
coord_flip() +
theme_bw()+
labs(x = "Lexical and phraseological indices", y = "Pearson Correlation Coefficients") +
theme(legend.position="bottom") +
scale_size(guide = 'none')
## Warning: Ignoring unknown aesthetics: width
#create model, use dredge for all subsets.
full_mod <- lm(Score ~ .,data = a3)
summary(full_mod)
##
## Call:
## lm(formula = Score ~ ., data = a3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.51442 -0.55351 0.00617 0.53979 1.98805
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.427083 0.034822 98.416 < 2e-16 ***
## mcd 2.052053 0.476389 4.308 2.01e-05 ***
## usf -0.018305 0.005345 -3.425 0.00067 ***
## v_nsubj_deltap_depcue -0.104989 1.032914 -0.102 0.91908
## v_dobj_MI 0.298912 0.069814 4.282 2.25e-05 ***
## v_advmod_deltap_strgst 7.409734 1.687236 4.392 1.39e-05 ***
## n_amod_MI 0.180960 0.056137 3.224 0.00135 **
## amod_freq_log -0.122507 0.049777 -2.461 0.01421 *
## adv_manner_freq_log 0.025192 0.012963 1.943 0.05257 .
## lex_mverb_freq_log 0.231498 0.108740 2.129 0.03378 *
## cw_lemma_freq_log 0.267065 0.180847 1.477 0.14042
## lemm_bg_deltap_strgst 1.154843 2.436164 0.474 0.63569
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7629 on 468 degrees of freedom
## Multiple R-squared: 0.2774, Adjusted R-squared: 0.2604
## F-statistic: 16.33 on 11 and 468 DF, p-value: < 2.2e-16
options(na.action = "na.fail")
drge <- dredge(full_mod)
## Fixed term is "(Intercept)"
sel_models_aic <- model.sel(drge)
sel_models_aic[1:10]
sel_models_bic <- model.sel(drge, rank = BIC)
## New rank 'BIC' applied to logLik objects
sel_models_bic[1:10]
best_mod_aic <- lm(Score ~ adv_manner_freq_log+amod_freq_log+cw_lemma_freq_log + a2$lex_mverb_freq_log +mcd + n_amod_MI+ usf + v_advmod_deltap_strgst+v_dobj_MI, data = a3)
summary(best_mod_aic)
##
## Call:
## lm(formula = Score ~ adv_manner_freq_log + amod_freq_log + cw_lemma_freq_log +
## a2$lex_mverb_freq_log + mcd + n_amod_MI + usf + v_advmod_deltap_strgst +
## v_dobj_MI, data = a3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.52236 -0.53524 0.00208 0.54040 1.98528
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.022047 1.149126 0.889 0.374236
## adv_manner_freq_log 0.025578 0.012907 1.982 0.048097 *
## amod_freq_log -0.123154 0.049557 -2.485 0.013299 *
## cw_lemma_freq_log 0.280646 0.178012 1.577 0.115569
## a2$lex_mverb_freq_log 0.225877 0.107875 2.094 0.036805 *
## mcd 2.093641 0.453211 4.620 4.97e-06 ***
## n_amod_MI 0.180348 0.055818 3.231 0.001320 **
## usf -0.018370 0.005305 -3.463 0.000584 ***
## v_advmod_deltap_strgst 7.505786 1.664943 4.508 8.27e-06 ***
## v_dobj_MI 0.300919 0.069547 4.327 1.85e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7615 on 470 degrees of freedom
## Multiple R-squared: 0.277, Adjusted R-squared: 0.2632
## F-statistic: 20.01 on 9 and 470 DF, p-value: < 2.2e-16
best_mod_bic <- lm(Score ~ lex_mverb_freq_log +mcd + n_amod_MI+ usf + v_advmod_deltap_strgst+v_dobj_MI, data = a3)
summary(best_mod_bic)
##
## Call:
## lm(formula = Score ~ lex_mverb_freq_log + mcd + n_amod_MI + usf +
## v_advmod_deltap_strgst + v_dobj_MI, data = a3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.10796 -0.51796 0.01449 0.51518 2.12062
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.427083 0.035035 97.819 < 2e-16 ***
## lex_mverb_freq_log 0.319191 0.093319 3.420 0.000679 ***
## mcd 2.097248 0.430706 4.869 1.53e-06 ***
## n_amod_MI 0.214454 0.054412 3.941 9.33e-05 ***
## usf -0.017850 0.005167 -3.455 0.000600 ***
## v_advmod_deltap_strgst 7.301149 1.670503 4.371 1.52e-05 ***
## v_dobj_MI 0.309273 0.070054 4.415 1.25e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7676 on 473 degrees of freedom
## Multiple R-squared: 0.2607, Adjusted R-squared: 0.2513
## F-statistic: 27.8 on 6 and 473 DF, p-value: < 2.2e-16
best_mod_bic2 <- lm(Score ~ mcd + n_amod_MI+ usf + v_advmod_deltap_strgst+v_dobj_MI, data = a3)
summary(best_mod_bic2)
##
## Call:
## lm(formula = Score ~ mcd + n_amod_MI + usf + v_advmod_deltap_strgst +
## v_dobj_MI, data = a3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2453 -0.5272 0.0060 0.5431 2.0018
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.427083 0.035428 96.733 < 2e-16 ***
## mcd 1.338130 0.373266 3.585 0.000372 ***
## n_amod_MI 0.205839 0.054964 3.745 0.000203 ***
## usf -0.017839 0.005225 -3.414 0.000695 ***
## v_advmod_deltap_strgst 7.681274 1.685509 4.557 6.6e-06 ***
## v_dobj_MI 0.273038 0.070026 3.899 0.000110 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7762 on 474 degrees of freedom
## Multiple R-squared: 0.2424, Adjusted R-squared: 0.2344
## F-statistic: 30.33 on 5 and 474 DF, p-value: < 2.2e-16
BIC(best_mod_bic2)
## [1] 1156.138
tab_model(best_mod_bic2, show.std = T, show.se = T, show.aic = T, col.order = c("est", "se", "p", "std.est", "std.ci", "ci.inner", "ci.outer", "stat", "df",
"response.level"), digits = 3)
| Â | Score | ||||
|---|---|---|---|---|---|
| Predictors | Estimates | std. Error | p | std. Beta | standardized CI |
| (Intercept) | 3.427 | 0.035 | <0.001 | -0.000 | -0.078 – 0.078 |
| mcd | 1.338 | 0.373 | <0.001 | 0.177 | 0.080 – 0.274 |
| n_amod_MI | 0.206 | 0.055 | <0.001 | 0.154 | 0.073 – 0.235 |
| usf | -0.018 | 0.005 | 0.001 | -0.165 | -0.260 – -0.070 |
| v_advmod_deltap_strgst | 7.681 | 1.686 | <0.001 | 0.187 | 0.106 – 0.267 |
| v_dobj_MI | 0.273 | 0.070 | <0.001 | 0.159 | 0.079 – 0.239 |
| Observations | 480 | ||||
| R2 / R2 adjusted | 0.242 / 0.234 | ||||
| AIC | 1126.922 | ||||
calc.relimp(best_mod_bic2)
## Response variable: Score
## Total response variance: 0.7869476
## Analysis based on 480 observations
##
## 5 Regressors:
## mcd n_amod_MI usf v_advmod_deltap_strgst v_dobj_MI
## Proportion of variance explained by model: 24.24%
## Metrics are not normalized (rela=FALSE).
##
## Relative importance metrics:
##
## lmg
## mcd 0.06727746
## n_amod_MI 0.03833927
## usf 0.05654548
## v_advmod_deltap_strgst 0.04631796
## v_dobj_MI 0.03392807
##
## Average coefficients for different model sizes:
##
## 1X 2Xs 3Xs 4Xs
## mcd 2.7425412 2.3463513 1.98241922 1.64750105
## n_amod_MI 0.3391793 0.2857634 0.24860956 0.22333768
## usf -0.0359896 -0.0298037 -0.02483311 -0.02089771
## v_advmod_deltap_strgst 10.7335938 9.3988633 8.52962411 7.98720763
## v_dobj_MI 0.3844653 0.3341693 0.30241797 0.28324401
## 5Xs
## mcd 1.33812951
## n_amod_MI 0.20583856
## usf -0.01783859
## v_advmod_deltap_strgst 7.68127362
## v_dobj_MI 0.27303775
plot_model(best_mod_bic2, type = 'pred')
## $mcd
##
## $n_amod_MI
##
## $usf
##
## $v_advmod_deltap_strgst
##
## $v_dobj_MI
plot(best_mod_bic2)
1/vif(best_mod_bic2)
## mcd n_amod_MI usf
## 0.6556000 0.9462629 0.6861909
## v_advmod_deltap_strgst v_dobj_MI
## 0.9519354 0.9596180